// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Derrick Oswald
//
// Revision Control Information
//
// $URL: https://svn.sourceforge.net/svnroot/htmlparser/trunk/parser/src/main/java/org/htmlparser/filters/RegexFilter.java $
// $Author: derrickoswald $
// $Date: 2006-09-16 10:44:17 -0400 (Sat, 16 Sep 2006) $
// $Revision: 4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
//   http://opensource.org/licenses/cpl1.0.php

package org.htmlparser.filters;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Text;

/**
 * This filter accepts all string nodes matching a regular expression. Because
 * this searches {@link org.htmlparser.Text Text} nodes, it is only useful for
 * finding small fragments of text, where it is unlikely to be broken up by a
 * tag. To find large fragments of text you should convert the page to plain
 * text with something like the
 * {@link org.htmlparser.beans.StringBean StringBean} and then apply the regular
 * expression.
 * <p>
 * For example, to look for dates use:
 * 
 * <pre>
 *   (19|20)\d\d([- \\/.](0[1-9]|1[012])[- \\/.](0[1-9]|[12][0-9]|3[01]))?
 * </pre>
 * 
 * as in:
 * 
 * <pre>
 * Parser parser = new Parser(&quot;http://cbc.ca&quot;);
 * RegexFilter filter = new RegexFilter(&quot;(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?&quot;);
 * NodeIterator iterator = parser.extractAllNodesThatMatch(filter).elements();
 * </pre>
 * 
 * which matches a date in yyyy-mm-dd format between 1900-01-01 and 2099-12-31,
 * with a choice of five separators, either a dash, a space, either kind of
 * slash or a period. The year is matched by (19|20)\d\d which uses alternation
 * to allow either 19 or 20 as the first two digits. The round brackets are
 * mandatory. The month is matched by 0[1-9]|1[012], again enclosed by round
 * brackets to keep the two options together. By using character classes, the
 * first option matches a number between 01 and 09, and the second matches 10,
 * 11 or 12. The last part of the regex consists of three options. The first
 * matches the numbers 01 through 09, the second 10 through 29, and the third
 * matches 30 or 31. The day and month are optional, but must occur together
 * because of the ()? bracketing after the year.
 */
public class RegexFilter implements NodeFilter {
	/**
	 * Use the matches() match strategy: the entire text must match the
	 * pattern.
	 */
	public static final int MATCH = 1;

	/**
	 * Use the lookingAt() match strategy: the pattern must match starting at
	 * the beginning of the text, but need not consume all of it.
	 */
	public static final int LOOKINGAT = 2;

	/**
	 * Use the find() match strategy: the pattern may match anywhere in the
	 * text.
	 */
	public static final int FIND = 3;

	/**
	 * The regular expression to search for, as supplied to
	 * {@link #setPattern(String)}.
	 */
	protected String mPatternString;

	/**
	 * The compiled form of {@link #mPatternString}.
	 */
	protected Pattern mPattern;

	/**
	 * The match strategy.
	 * 
	 * @see #RegexFilter(String, int)
	 */
	protected int mStrategy;

	/**
	 * Creates a new instance of RegexFilter that accepts string nodes matching
	 * the regular expression ".*" using the FIND strategy.
	 */
	public RegexFilter() {
		this(".*", FIND);
	}

	/**
	 * Creates a new instance of RegexFilter that accepts string nodes matching
	 * a regular expression using the FIND strategy.
	 * 
	 * @param pattern
	 *            The pattern to search for.
	 */
	public RegexFilter(String pattern) {
		this(pattern, FIND);
	}

	/**
	 * Creates a new instance of RegexFilter that accepts string nodes matching
	 * a regular expression.
	 * 
	 * @param pattern
	 *            The pattern to search for.
	 * @param strategy
	 *            The type of match:
	 *            <ol>
	 *            <li>{@link #MATCH} use matches() method: attempts to match
	 *            the entire input sequence against the pattern</li>
	 *            <li>{@link #LOOKINGAT} use lookingAt() method: attempts to
	 *            match the input sequence, starting at the beginning, against
	 *            the pattern</li>
	 *            <li>{@link #FIND} use find() method: scans the input sequence
	 *            looking for the next subsequence that matches the pattern</li>
	 *            </ol>
	 * @throws java.util.regex.PatternSyntaxException
	 *             If the pattern's syntax is invalid.
	 * @throws IllegalArgumentException
	 *             If the strategy is not one of MATCH, LOOKINGAT or FIND.
	 */
	public RegexFilter(String pattern, int strategy) {
		// Goes through the setters so the pattern is compiled and the
		// strategy is validated in exactly one place.
		setPattern(pattern);
		setStrategy(strategy);
	}

	/**
	 * Get the search pattern.
	 * 
	 * @return Returns the pattern string most recently set.
	 */
	public String getPattern() {
		return (mPatternString);
	}

	/**
	 * Set the search pattern. The pattern is compiled immediately; if
	 * compilation fails the previously set pattern remains in effect.
	 * 
	 * @param pattern
	 *            The pattern to set.
	 * @throws java.util.regex.PatternSyntaxException
	 *             If the pattern's syntax is invalid.
	 * @throws NullPointerException
	 *             If the pattern is <code>null</code>.
	 */
	public void setPattern(String pattern) {
		// Compile before touching any field: if compile() throws, the string
		// returned by getPattern() must still agree with the compiled pattern
		// used by accept().
		Pattern compiled = Pattern.compile(pattern);

		mPatternString = pattern;
		mPattern = compiled;
	}

	/**
	 * Get the search strategy.
	 * 
	 * @return Returns the strategy.
	 */
	public int getStrategy() {
		return (mStrategy);
	}

	/**
	 * Set the search strategy.
	 * 
	 * @param strategy
	 *            The strategy to use. One of MATCH, LOOKINGAT or FIND.
	 * @throws IllegalArgumentException
	 *             If the strategy is not one of the three constants; the
	 *             current strategy is left unchanged in that case.
	 */
	public void setStrategy(int strategy) {
		if ((strategy != MATCH) && (strategy != LOOKINGAT) && (strategy != FIND))
			throw new IllegalArgumentException("illegal strategy (" + strategy + ")");
		mStrategy = strategy;
	}

	/**
	 * Accept string nodes that match the regular expression.
	 * 
	 * @param node
	 *            The node to check.
	 * @return <code>true</code> if the node is a {@link Text} node and the
	 *         regular expression matches its text according to the current
	 *         strategy, <code>false</code> otherwise (including when the node
	 *         is <code>null</code> or is not a {@link Text} node).
	 */
	public boolean accept(Node node) {
		// Only text nodes are candidates; instanceof also rejects null.
		if (!(node instanceof Text))
			return (false);

		Matcher matcher = mPattern.matcher(((Text) node).getText());
		switch (mStrategy) {
		case MATCH:
			return (matcher.matches());
		case LOOKINGAT:
			return (matcher.lookingAt());
		case FIND:
		default:
			// mStrategy is protected, so a subclass could store a value that
			// setStrategy() would reject; such values behave like FIND.
			return (matcher.find());
		}
	}
}
